python data preparation with siuba

Python
Author

Tony Duan

Published

October 2, 2023

## Package

Code
#python3 -m pip install siuba
Code
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

from siuba import _, mutate, filter, group_by, summarize,show_query
from siuba import *
from siuba.data import mtcars,penguins
Code
small_mtcars = mtcars >> select(_.cyl, _.mpg, _.hp)>> head(5)
small_penguins=penguins>> head(5)

1 select

Code
small_mtcars >> select(_.cyl, _.mpg)
cyl mpg
0 6 21.0
1 6 21.0
2 4 22.8
3 6 21.4
4 8 18.7

exclude

Code
small_mtcars >> select(~_.cyl)
mpg hp
0 21.0 110
1 21.0 110
2 22.8 93
3 21.4 110
4 18.7 175

2 Renaming

Code
small_mtcars >> rename(new_name_mpg = _.mpg)
cyl new_name_mpg hp
0 6 21.0 110
1 6 21.0 110
2 4 22.8 93
3 6 21.4 110
4 8 18.7 175

3 Mutate

Code
mtcars.head()>> mutate(gear2 = _.gear+1)
mpg cyl disp hp drat wt qsec vs am gear carb gear2
0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 5
1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 5
2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 5
3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 4
4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 4

4 Filter

Code
mtcars>> filter(_.gear ==4)
mpg cyl disp hp drat wt qsec vs am gear carb
0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2

Filters with OR conditions

Code
mtcars >> filter((_.cyl == 4) | (_.gear == 5))
mpg cyl disp hp drat wt qsec vs am gear carb
2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2

Dropping NA values

5 group by

Code
tbl_query = (mtcars
  >> group_by(_.cyl)
  >> summarize(avg_hp = _.hp.mean())
  )

tbl_query
cyl avg_hp
0 4 82.636364
1 6 122.285714
2 8 209.214286

6 order

Code
small_mtcars >> arrange(_.hp)
cyl mpg hp
2 4 22.8 93
0 6 21.0 110
1 6 21.0 110
3 6 21.4 110
4 8 18.7 175

Sort in descending order

Code
small_mtcars >> arrange(-_.hp)
cyl mpg hp
4 8 18.7 175
0 6 21.0 110
1 6 21.0 110
3 6 21.4 110
2 4 22.8 93

Arrange by multiple variables

Code
small_mtcars >> arrange(_.cyl, -_.mpg)
cyl mpg hp
2 4 22.8 93
3 6 21.4 110
0 6 21.0 110
1 6 21.0 110
4 8 18.7 175

7 using siuba with database

Code
from sqlalchemy import create_engine
from siuba.sql import LazyTbl
from siuba import _, group_by, summarize, show_query, collect 
from siuba.data import mtcars
Code
# copy in to sqlite, using the pandas .to_sql() method
engine = create_engine("sqlite:///:memory:")
mtcars.to_sql("mtcars", engine, if_exists = "replace")
32
Code
# Create a lazy SQL DataFrame
tbl_mtcars = LazyTbl(engine, "mtcars")
tbl_mtcars
# Source: lazy query
# DB Conn: Engine(sqlite:///:memory:)
# Preview:
index mpg cyl disp hp drat wt qsec vs am gear carb
0 0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2

# .. may have more rows

Code
# connect with siuba

tbl_query = (tbl_mtcars
  >> group_by(_.cyl)
  >> summarize(avg_hp = _.hp.mean())
  )

tbl_query
# Source: lazy query
# DB Conn: Engine(sqlite:///:memory:)
# Preview:
cyl avg_hp
0 4 82.636364
1 6 122.285714
2 8 209.214286

# .. may have more rows

Code
q = tbl_query >> show_query()
SELECT mtcars.cyl, avg(mtcars.hp) AS avg_hp 
FROM mtcars GROUP BY mtcars.cyl
Code
tbl_query >> collect()
cyl avg_hp
0 4 82.636364
1 6 122.285714
2 8 209.214286

8 Reference

https://siuba.org/